import os
import multiprocessing
from datasets import load_from_disk

# Worker count handed to dataset.map(num_proc=...) below: one process per CPU
# reported by multiprocessing.cpu_count().
num_processes = multiprocessing.cpu_count()


def process_func(example):
    """Collapse the paragraphs of one record into a single string.

    Args:
        example: Mapping whose '段落' entry is an iterable of mappings, each
            holding its text under the "内容" key.

    Returns:
        A dict with the single key '段落' whose value is every paragraph's
        "内容" text, in original order, joined by newline characters.
    """
    paragraph_texts = (paragraph["内容"] for paragraph in example['段落'])
    merged = "\n".join(paragraph_texts)
    return {'段落': merged}


# Load the on-disk dataset and keep only its "train" split.
path = "data/mnbvc/peoples_daily"
dataset = load_from_disk(path)
print(dataset)
dataset = dataset["train"]

# Apply process_func to every record using num_processes workers. All original
# columns are removed, so the result holds only what process_func returns.
# process_func is passed directly; wrapping it in a lambda added nothing.
# NOTE(review): a multi-process map launched at module level may need an
# `if __name__ == "__main__":` guard on spawn-based platforms — confirm.
new_dataset = dataset.map(
    process_func,
    remove_columns=dataset.column_names,
    num_proc=num_processes,
)
print(new_dataset)


# Dump every flattened document from new_dataset into one plain-text file.
save_path = "data/processed/peoples_daily.txt"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
# Explicit UTF-8: without it open() falls back to the platform's default
# encoding, which may fail on (or mis-encode) non-ASCII text such as the
# Chinese content this corpus is expected to contain.
with open(save_path, "w", encoding="utf-8") as f:
    for line in new_dataset["段落"]:
        # Trim surrounding whitespace so each document ends with exactly one newline.
        f.write(line.strip() + "\n")
